{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Train Test Split"
      ],
      "metadata": {}
    },
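    {
      "cell_type": "markdown",
      "source": [
        "Finance Historical - Features Analysis: fits a linear regression to AMD daily price data and compares in-sample and out-of-sample error across train/test splits."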
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "# Library\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import math\n",
        "\n",
        "import matplotlib.pyplot as plt\n",
        "import seaborn as sns\n",
        "\n",
        "import warnings\n",
        "warnings.filterwarnings(\"ignore\")\n",
        "\n",
        "import fix_yahoo_finance as yf\n",
        "yf.pdr_override()"
      ],
      "outputs": [],
      "execution_count": 1,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "stock_name = 'AMD'\n",
        "start = '2010-01-01' \n",
        "end = '2019-01-01'\n",
        "df = yf.download(stock_name, start, end)\n",
        "df = df.reset_index()"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[*********************100%***********************]  1 of 1 downloaded\n"
          ]
        }
      ],
      "execution_count": 2,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "df.head()"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 3,
          "data": {
            "text/plain": [
              "        Date  Open  High   Low  Close  Adj Close    Volume\n",
              "0 2010-01-04  9.79  9.90  9.68   9.70       9.70  18748700\n",
              "1 2010-01-05  9.71  9.90  9.68   9.71       9.71  22145700\n",
              "2 2010-01-06  9.68  9.76  9.55   9.57       9.57  18643400\n",
              "3 2010-01-07  9.51  9.55  9.18   9.47       9.47  26806800\n",
              "4 2010-01-08  9.37  9.47  9.29   9.43       9.43  13752800"
            ],
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Date</th>\n",
              "      <th>Open</th>\n",
              "      <th>High</th>\n",
              "      <th>Low</th>\n",
              "      <th>Close</th>\n",
              "      <th>Adj Close</th>\n",
              "      <th>Volume</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>2010-01-04</td>\n",
              "      <td>9.79</td>\n",
              "      <td>9.90</td>\n",
              "      <td>9.68</td>\n",
              "      <td>9.70</td>\n",
              "      <td>9.70</td>\n",
              "      <td>18748700</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>2010-01-05</td>\n",
              "      <td>9.71</td>\n",
              "      <td>9.90</td>\n",
              "      <td>9.68</td>\n",
              "      <td>9.71</td>\n",
              "      <td>9.71</td>\n",
              "      <td>22145700</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>2010-01-06</td>\n",
              "      <td>9.68</td>\n",
              "      <td>9.76</td>\n",
              "      <td>9.55</td>\n",
              "      <td>9.57</td>\n",
              "      <td>9.57</td>\n",
              "      <td>18643400</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>2010-01-07</td>\n",
              "      <td>9.51</td>\n",
              "      <td>9.55</td>\n",
              "      <td>9.18</td>\n",
              "      <td>9.47</td>\n",
              "      <td>9.47</td>\n",
              "      <td>26806800</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>2010-01-08</td>\n",
              "      <td>9.37</td>\n",
              "      <td>9.47</td>\n",
              "      <td>9.29</td>\n",
              "      <td>9.43</td>\n",
              "      <td>9.43</td>\n",
              "      <td>13752800</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 3,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "df.shape"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 4,
          "data": {
            "text/plain": [
              "(2264, 7)"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 4,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "df.describe()"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 5,
          "data": {
            "text/plain": [
              "              Open         High          Low        Close    Adj Close  \\\n",
              "count  2264.000000  2264.000000  2264.000000  2264.000000  2264.000000   \n",
              "mean      7.154302     7.306564     6.992120     7.150115     7.150115   \n",
              "std       5.182738     5.321431     5.026407     5.179124     5.179124   \n",
              "min       1.620000     1.690000     1.610000     1.620000     1.620000   \n",
              "25%       3.420000     3.467500     3.330000     3.397500     3.397500   \n",
              "50%       5.970000     6.110000     5.825000     6.000000     6.000000   \n",
              "75%       9.115000     9.320000     8.992500     9.117500     9.117500   \n",
              "max      33.180000    34.139999    32.189999    32.720001    32.720001   \n",
              "\n",
              "             Volume  \n",
              "count  2.264000e+03  \n",
              "mean   3.457876e+07  \n",
              "std    3.349831e+07  \n",
              "min    0.000000e+00  \n",
              "25%    1.418810e+07  \n",
              "50%    2.295320e+07  \n",
              "75%    4.171090e+07  \n",
              "max    3.250584e+08  "
            ],
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Open</th>\n",
              "      <th>High</th>\n",
              "      <th>Low</th>\n",
              "      <th>Close</th>\n",
              "      <th>Adj Close</th>\n",
              "      <th>Volume</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>count</th>\n",
              "      <td>2264.000000</td>\n",
              "      <td>2264.000000</td>\n",
              "      <td>2264.000000</td>\n",
              "      <td>2264.000000</td>\n",
              "      <td>2264.000000</td>\n",
              "      <td>2.264000e+03</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>mean</th>\n",
              "      <td>7.154302</td>\n",
              "      <td>7.306564</td>\n",
              "      <td>6.992120</td>\n",
              "      <td>7.150115</td>\n",
              "      <td>7.150115</td>\n",
              "      <td>3.457876e+07</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>std</th>\n",
              "      <td>5.182738</td>\n",
              "      <td>5.321431</td>\n",
              "      <td>5.026407</td>\n",
              "      <td>5.179124</td>\n",
              "      <td>5.179124</td>\n",
              "      <td>3.349831e+07</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>min</th>\n",
              "      <td>1.620000</td>\n",
              "      <td>1.690000</td>\n",
              "      <td>1.610000</td>\n",
              "      <td>1.620000</td>\n",
              "      <td>1.620000</td>\n",
              "      <td>0.000000e+00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>25%</th>\n",
              "      <td>3.420000</td>\n",
              "      <td>3.467500</td>\n",
              "      <td>3.330000</td>\n",
              "      <td>3.397500</td>\n",
              "      <td>3.397500</td>\n",
              "      <td>1.418810e+07</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>50%</th>\n",
              "      <td>5.970000</td>\n",
              "      <td>6.110000</td>\n",
              "      <td>5.825000</td>\n",
              "      <td>6.000000</td>\n",
              "      <td>6.000000</td>\n",
              "      <td>2.295320e+07</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>75%</th>\n",
              "      <td>9.115000</td>\n",
              "      <td>9.320000</td>\n",
              "      <td>8.992500</td>\n",
              "      <td>9.117500</td>\n",
              "      <td>9.117500</td>\n",
              "      <td>4.171090e+07</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>max</th>\n",
              "      <td>33.180000</td>\n",
              "      <td>34.139999</td>\n",
              "      <td>32.189999</td>\n",
              "      <td>32.720001</td>\n",
              "      <td>32.720001</td>\n",
              "      <td>3.250584e+08</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 5,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "df.columns"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 6,
          "data": {
            "text/plain": [
              "Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 6,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "df = df.drop(['Date'], axis=1)\n",
        "df.head()"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 7,
          "data": {
            "text/plain": [
              "   Open  High   Low  Close  Adj Close    Volume\n",
              "0  9.79  9.90  9.68   9.70       9.70  18748700\n",
              "1  9.71  9.90  9.68   9.71       9.71  22145700\n",
              "2  9.68  9.76  9.55   9.57       9.57  18643400\n",
              "3  9.51  9.55  9.18   9.47       9.47  26806800\n",
              "4  9.37  9.47  9.29   9.43       9.43  13752800"
            ],
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Open</th>\n",
              "      <th>High</th>\n",
              "      <th>Low</th>\n",
              "      <th>Close</th>\n",
              "      <th>Adj Close</th>\n",
              "      <th>Volume</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>9.79</td>\n",
              "      <td>9.90</td>\n",
              "      <td>9.68</td>\n",
              "      <td>9.70</td>\n",
              "      <td>9.70</td>\n",
              "      <td>18748700</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>9.71</td>\n",
              "      <td>9.90</td>\n",
              "      <td>9.68</td>\n",
              "      <td>9.71</td>\n",
              "      <td>9.71</td>\n",
              "      <td>22145700</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>9.68</td>\n",
              "      <td>9.76</td>\n",
              "      <td>9.55</td>\n",
              "      <td>9.57</td>\n",
              "      <td>9.57</td>\n",
              "      <td>18643400</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>9.51</td>\n",
              "      <td>9.55</td>\n",
              "      <td>9.18</td>\n",
              "      <td>9.47</td>\n",
              "      <td>9.47</td>\n",
              "      <td>26806800</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>9.37</td>\n",
              "      <td>9.47</td>\n",
              "      <td>9.29</td>\n",
              "      <td>9.43</td>\n",
              "      <td>9.43</td>\n",
              "      <td>13752800</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 7,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.linear_model import LinearRegression\n",
        "\n",
        "data = df.copy()\n",
        "target = data.pop('Adj Close')\n",
        "\n",
        "lr = LinearRegression(fit_intercept=True)\n",
        "lr.fit(data, target)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 8,
          "data": {
            "text/plain": [
              "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n",
              "         normalize=False)"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 8,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.metrics import mean_squared_error\n",
        "\n",
        "print(\"R^2:\", lr.score(data, target)) \n",
        "\n",
        "predictions = lr.predict(data)\n",
        "mse = mean_squared_error(target, predictions)\n",
        "rmse = np.sqrt(mse)\n",
        "print(\"RMSE:\", rmse)"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "R^2: 1.0\n",
            "RMSE: 1.82037353877e-13\n"
          ]
        }
      ],
      "execution_count": 9,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Train & Test set Split\n",
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "X_train, X_test, y_train, y_test = train_test_split(data, target, shuffle=True,\n",
        "                                                    test_size=0.5, random_state=49)"
      ],
      "outputs": [],
      "execution_count": 10,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Fitting Model on Training Set\n",
        "lr_split = LinearRegression(fit_intercept=True)\n",
        "lr_split.fit(X_train, y_train)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 11,
          "data": {
            "text/plain": [
              "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n",
              "         normalize=False)"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 11,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# In-Sample Set\n",
        "from sklearn.metrics import mean_squared_error\n",
        "model = LinearRegression(fit_intercept=True)\n",
        "model.fit(X_train, y_train)\n",
        "predictions = model.predict(X_train)\n",
        "mse = mean_squared_error(y_train, predictions)\n",
        "rmse = np.sqrt(mse)\n",
        "accuracy = model.score(X_train, y_train)\n",
        "\n",
        "print('In-Sample Set')\n",
        "print('MSE:', mse)\n",
        "print('RMSE:', rmse)\n",
        "print('Accuracy on X_train & y_train:', accuracy)"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "In-Sample Set\n",
            "MSE: 2.53134444301e-26\n",
            "RMSE: 1.59101993797e-13\n",
            "Accuracy on X_train & y_train: 1.0\n"
          ]
        }
      ],
      "execution_count": 12,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "model = LinearRegression(fit_intercept=True)\n",
        "model.fit(X_test, y_test)\n",
        "predictions = model.predict(X_test)\n",
        "mse = mean_squared_error(y_test, predictions)\n",
        "rmse = np.sqrt(mse)\n",
        "accuracy = model.score(X_test, y_test)\n",
        "\n",
        "print('Out-of-Sample Set')\n",
        "print('MSE:', mse)\n",
        "print('RMSE:', rmse)\n",
        "print('Accuracy on X_test & y_test:', accuracy)"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Out-of-Sample Set\n",
            "MSE: 3.90252316071e-26\n",
            "RMSE: 1.97548048857e-13\n",
            "Accuracy on X_test & y_test: 1.0\n"
          ]
        }
      ],
      "execution_count": 13,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def calc_ISE(X_train, y_train, model):\n",
        "    '''returns the in-sample R^2 and RMSE; assumes model already fit.'''\n",
        "    predictions = model.predict(X_train)\n",
        "    mse = mean_squared_error(y_train, predictions)\n",
        "    rmse = np.sqrt(mse)\n",
        "    return model.score(X_train, y_train), rmse\n",
        "    \n",
        "def calc_OSE(X_test, y_test, model):\n",
        "    '''returns the out-of-sample R^2 and RMSE; assumes model already fit.'''\n",
        "    predictions = model.predict(X_test)\n",
        "    mse = mean_squared_error(y_test, predictions)\n",
        "    rmse = np.sqrt(mse)\n",
        "    return model.score(X_test, y_test), rmse"
      ],
      "outputs": [],
      "execution_count": 14,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "is_r2, ise = calc_ISE(X_train, y_train, lr_split)\n",
        "os_r2, ose = calc_OSE(X_test, y_test, lr_split)\n",
        "\n",
        "# show dataset sizes\n",
        "data_list = (('R^2_in', is_r2), ('R^2_out', os_r2), \n",
        "             ('ISE', ise), ('OSE', ose))\n",
        "for item in data_list:\n",
        "    print('{:10}: {}'.format(item[0], item[1]))"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "R^2_in    : 1.0\n",
            "R^2_out   : 1.0\n",
            "ISE       : 1.5910199379664295e-13\n",
            "OSE       : 1.6411248849954045e-13\n"
          ]
        }
      ],
      "execution_count": 15,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# create array of random_state values\n",
        "random_states = np.random.randint(1, 100, size=5)\n",
        "random_states\n",
        "\n",
        "for random_state in random_states:\n",
        "    # split data according to random state\n",
        "    X_train, X_test, y_train, y_test = train_test_split(data, target, \n",
        "                                                        shuffle=True,\n",
        "                                                        test_size=0.5, \n",
        "                                                        random_state=random_state)\n",
        "    # instantiate mmodel\n",
        "    lr = LinearRegression(fit_intercept=True)\n",
        "    # fit model\n",
        "    lr.fit(X_train, y_train)\n",
        "    # capture key metrics\n",
        "    is_r2, ise = calc_ISE(X_train, y_train, lr)\n",
        "    os_r2, ose = calc_OSE(X_test, y_test, lr)\n",
        "    # round values\n",
        "    is_r2, os_r2 = round(is_r2, 4), round(os_r2, 4)\n",
        "    ise, ose = round(ise, 4), round(ose, 4)\n",
        "    \n",
        "    # print key metrics\n",
        "    print('Random State: {}'.format(random_state))\n",
        "    print('IS_R^2: {} | IS_RMSE: {}'.format(is_r2, ise))\n",
        "    print('OS_R^2: {} | OS_RMSE: {}'.format(os_r2, ose))\n",
        "    print('-'*34)"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Random State: 1\n",
            "IS_R^2: 1.0 | IS_RMSE: 0.0\n",
            "OS_R^2: 1.0 | OS_RMSE: 0.0\n",
            "----------------------------------\n",
            "Random State: 17\n",
            "IS_R^2: 1.0 | IS_RMSE: 0.0\n",
            "OS_R^2: 1.0 | OS_RMSE: 0.0\n",
            "----------------------------------\n",
            "Random State: 58\n",
            "IS_R^2: 1.0 | IS_RMSE: 0.0\n",
            "OS_R^2: 1.0 | OS_RMSE: 0.0\n",
            "----------------------------------\n",
            "Random State: 28\n",
            "IS_R^2: 1.0 | IS_RMSE: 0.0\n",
            "OS_R^2: 1.0 | OS_RMSE: 0.0\n",
            "----------------------------------\n",
            "Random State: 13\n",
            "IS_R^2: 1.0 | IS_RMSE: 0.0\n",
            "OS_R^2: 1.0 | OS_RMSE: 0.0\n",
            "----------------------------------\n"
          ]
        }
      ],
      "execution_count": 16,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    }
  ],
  "metadata": {
    "kernel_info": {
      "name": "python3"
    },
    "language_info": {
      "nbconvert_exporter": "python",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "version": 3,
        "name": "ipython"
      },
      "pygments_lexer": "ipython3",
      "version": "3.5.5",
      "file_extension": ".py",
      "name": "python"
    },
    "kernelspec": {
      "name": "python3",
      "language": "python",
      "display_name": "Python 3"
    },
    "nteract": {
      "version": "0.12.2"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}